Preparing Environement

In [82]:
# Load the "autoreload" extension
%load_ext autoreload
# always reload modules marked with "%aimport"
%autoreload 1

import os
import sys
# add the 'src' directory as one where we can import modules
root_dir = os.path.join(os.getcwd(),os.pardir,os.pardir)
src_dir = os.path.join(root_dir, 'src')
if src_dir not in sys.path: sys.path.append(src_dir)
    
import math
import copy as cp
from datetime import datetime

import numpy as np
import pandas as pd

import matplotlib.style
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans,Birch,AgglomerativeClustering
from sklearn.manifold import TSNE
from sklearn import preprocessing

from scipy.cluster import hierarchy
from scipy import stats
from scipy.stats import mstats

import helpers as hlp
%aimport helpers
from external import kMedoids
from IPython.display import display

#printing
pd.options.display.float_format = '{:,.2f}'.format
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload

Load preprocessed data

In [83]:
raw_path = os.path.join(root_dir,"data\\raw\\")
interim_path = os.path.join(root_dir,"data\\interim\\") 
processed_path = os.path.join(root_dir,"data\\processed\\") 

reports_path = os.path.join(root_dir,"reports\\")
models_path = os.path.join(root_dir,"models\\")

raw_file_name ="bnd_product_p2c4_raw.csv"
clean_file_name = "bnd_product_p2c4_clean.csv"
z_file_name ="bnd_product_z_p2c4_clean.csv"

row_headers = ['Product','Client']
n_row_headers = len(row_headers)



product_raw_df = pd.read_csv(interim_path+raw_file_name, sep=';', encoding='utf-8')
product_df = pd.read_csv(interim_path+clean_file_name, sep=';', encoding='utf-8')
product_df_full = pd.read_csv(processed_path+z_file_name, sep=';', encoding='utf-8')

X_z = product_df_full.values[:,n_row_headers:]



nb_col = X_z.shape[1]
X_pca = PCA(n_components = nb_col).fit_transform(X_z)


product_df_full.head()
Out[83]:
Product Client 2016-01-24 00:00:00 2016-01-31 00:00:00 2016-02-07 00:00:00 2016-02-14 00:00:00 2016-02-21 00:00:00 2016-02-28 00:00:00 2016-03-06 00:00:00 2016-03-13 00:00:00 ... 2017-09-24 00:00:00 2017-10-01 00:00:00 2017-10-08 00:00:00 2017-10-15 00:00:00 2017-10-22 00:00:00 2017-10-29 00:00:00 2017-11-05 00:00:00 2017-11-12 00:00:00 2017-11-19 00:00:00 2017-11-26 00:00:00
0 GBA001BND060SS 68A139 1.61 1.61 1.61 1.61 1.61 1.61 1.61 1.61 ... -1.00 -1.26 -1.26 -1.26 -1.26 -1.17 -1.26 -0.96 -0.88 -1.26
1 GBA001BND060SS 68C120 0.51 0.30 -0.40 -0.91 -1.01 -1.10 -1.10 -0.55 ... 1.08 0.06 0.99 1.76 1.80 1.71 0.60 -0.98 -1.26 -1.26
2 GBA001BND060SS 68C122 -0.15 -1.04 -1.04 -1.04 -0.96 -0.52 0.48 1.66 ... -0.45 -0.33 -0.22 -0.15 -0.22 -0.22 -0.30 -0.63 -0.89 -1.04
3 GBA001BND060SS 68C126 -0.11 0.65 0.65 0.97 1.16 0.71 0.90 1.03 ... -1.37 -1.43 -1.43 -1.43 -1.43 -1.43 -1.24 -1.43 -1.43 -1.43
4 GBA001BND060SS 68C129 -0.35 -0.35 0.33 1.25 0.79 0.79 0.33 -0.35 ... 0.56 1.25 0.79 1.02 1.25 0.79 -0.35 -1.95 -1.95 -1.95

5 rows × 99 columns

3. Modeling - Clustering Algorithms

Try out Hierarchical clustering, kMeans and kMedodis on raw (cleaned) data. Then, plot the PCA to visualize the result of the clustering on the principal components

Agglomerative - Automated Cut-Off Selection

In [84]:
from scipy.cluster import hierarchy

SSE = {}
SILOUHAITE = {}

Z = hierarchy.linkage(X_z, method='complete',metric='euclidean')
dn = hierarchy.dendrogram(Z, truncate_mode='lastp', p=50, leaf_rotation=90., leaf_font_size=7., show_contracted=True)
plt.show()



plt.figure()
labels_h_cc = hierarchy.fcluster(Z, t=100 ,criterion = 'maxclust')
hlp.Clusters_plot(X= X_pca, labels = labels_h_cc,info=["AgglomerativeClustering","Complete - euclidean ","%d clusters"%len(set(labels_h_cc))])


SSE["Agg_complete"] = hlp.getSSE(X_z,X_z[labels_h_cc])
<matplotlib.figure.Figure at 0x295366a0>
In [85]:
last = Z[-150:, 2]
last_rev = last[::-1]
idxs = np.arange(1, len(last) + 1)
plt.plot(idxs, last_rev)

acceleration = np.diff(last, 2)  # 2nd derivative of the distances
acceleration_rev = acceleration[::-1]
plt.plot(idxs[:-2] + 1, acceleration_rev)
plt.xticks(np.arange(1,len(last)))
plt.show()
best_ks = np.abs(acceleration_rev).argsort()[::-1]
k =  best_ks+ 2  # if idx 0 is the max of this we want 2 clusters
print ("clusters:", k)
clusters: [  5   3   9  40  39   8  70  12   2   6   7  71  27  11  41  29  50  42
  51  60  55  30  59  32   4 119  19  77  76  20  97  96 147  80  26  17
  21 102 118 146 140  68 139  81 121  33 144  93  45  95 136  14 120  52
  53  92  47 115 124 145  98  23 123 101  48 126  31  37  49  72  67  54
  36 132  85  91 128 131  46  61 133  73  58  75  18  86  65  82  74 111
 117  64 127  69  43 148  28  87  90 135 138  44 129  35 103 110  56  38
 149 142 141 109  10  99  94  78 143  62  22  15  57 107 122  34  84 116
 106 104  63  89 137  25  79 130 100 114 108 113 112  13  16 134 105 125
  24  83  66  88]

K-Means: Validate different numbers of clusters

In [33]:
%matplotlib inline
clusters= np.linspace(70,120,25).astype(int)
inertia = []
silouhaite = []
for cluster in clusters:
    kmeans = KMeans(n_clusters=cluster).fit(X_z)
    silouhaite.append(hlp.getSilouhaite(X_z,kmeans.labels_))
    inertia += [np.sqrt(kmeans.inertia_/len(kmeans.labels_))]

plt.figure(figsize=(16,4))
    
plt.subplot(1,2,1)
inertia = np.array(inertia)
plt.title("Total inertia according to clusters")    
plt.plot(np.arange(0,len(clusters)),inertia)#scale it to acc2
plt.xticks(np.arange(0,len(clusters)),clusters)


acc = np.diff(inertia, 2)  # 2nd derivative of the inertia curve
#plt.plot(np.arange(2,len(clusters)), acc)
best_ks = acc.argsort()[::-1]
k =  best_ks+ 2  # if idx 0 is the max of this we want 2 clusters
print ("clusters:",clusters[k])



plt.subplot(1,2,2)
silouhaite = np.array(silouhaite)
plt.title("Silouhaite score according to clusters")    
plt.plot(np.arange(0,len(clusters)),silouhaite)#scale it to acc2
plt.xticks(np.arange(0,len(clusters)),clusters)
best_ks = silouhaite.argsort()[::-1]
print("clusters:",clusters[best_ks])


plt.show()
clusters: [113  76  86  90  99 103  82 117 105  80 109 115  95  97 107 101  88  74
  92  78 111  84 120]
clusters: [ 97 120 111  86  74 107 109  95 113  78  80 103  76  99  72  90 105  92
 101  88  70 117  84  82 115]

Ward Clustering

In [38]:
n_cluster = 113

from sklearn.cluster import AgglomerativeClustering

ward = AgglomerativeClustering(n_clusters=n_cluster, linkage='ward').fit(X_z)
label = ward.labels_

SSE['Ward'] = hlp.getSSE(X_z,X_z[label])
print(SSE['Ward'])
hlp.Clusters_plot(X= X_pca, labels = label,info=["AgglomerativeClustering","Ward","%d clusters"%len(set(label))])
497878.7614838429

K-means

In [42]:
%matplotlib inline

kmeans = KMeans(n_clusters=n_cluster).fit(X_z)
label = kmeans.labels_
labels_kmeans = label

SSE["kMeans"] = hlp.getSSE(X_z,X_z[labels_kmeans])

PCA representation of the clustering

In [21]:
X = X_pca[:,:3]
hlp.Clusters_plot(X= X_pca, labels = label,info=["K-Means","Euclidean","%d clusters"%len(set(label))])

Custom Distances

In [86]:
from scipy.stats import spearmanr
def spearmanr_dist(x,y):
    rho, pval = spearmanr(x,y)
    return rho


r,p = spearmanr(X_z)
np.fill_diagonal(r,0)
In [87]:
from external import kMedoids
from scipy.spatial.distance import pdist,squareform

n_obs = X_z.shape[1]
corr_distance = squareform(pdist(X_z, 'correlation'))
euclid_distance = squareform(pdist(X_z, 'euclidean'))
sqcorr_distance = corr_distance**2
#spearman_distance = squareform(pdist(X_z, lambda u, v: spearmanr_dist(u,v)))

K-medoid: validate number of clusters using silouhaite

In [91]:
%matplotlib inline
clusters= np.linspace(70,115,25).astype(int)
silouhaite = []
inertia = []
for cluster in clusters:
    labels, medoids = kMedoids.cluster(euclid_distance,k= cluster)
    silouhaite.append(hlp.getSilouhaite(X_z,labels))
    sse = hlp.getSSE(X_z,X_z[labels])
    inertia.append(np.sqrt(sse/len(labels_kmedoids)))
    

plt.figure(figsize=(16,4))
    
plt.subplot(1,2,1)
inertia = np.array(inertia)
plt.title("Total inertia according to clusters")    
plt.plot(np.arange(0,len(clusters)),inertia)#scale it to acc2
plt.xticks(np.arange(0,len(clusters)),clusters)


acc = np.diff(inertia, 2)  # 2nd derivative of the inertia curve
#plt.plot(np.arange(2,len(clusters)), acc)
best_ks = acc.argsort()[::-1]
k =  best_ks+ 2  # if idx 0 is the max of this we want 2 clusters
print ("clusters:",clusters[k])



plt.subplot(1,2,2)
silouhaite = np.array(silouhaite)
plt.title("Silouhaite score according to clusters")    
plt.plot(np.arange(0,len(clusters)),silouhaite)#scale it to acc2
plt.xticks(np.arange(0,len(clusters)),clusters)
best_ks = silouhaite.argsort()[::-1]
print("clusters:",clusters[best_ks])
[ 86  96  81 111 107 103  70 101 100  85  75  92  83  88  71 109  90 105
  94 113 115  79  77  73  98]
clusters: [ 75  96 100  79 115  81  86  92 111 105 109  90 103 107  88  85 101 113
  73  83  94  98  77]
clusters: [ 86  96  81 111 107 103  70 101 100  85  75  92  83  88  71 109  90 105
  94 113 115  79  77  73  98]

K-medoids Clustering

In [94]:
n_cluster = 113
label, medoids_euc = kMedoids.cluster(euclid_distance,k= n_cluster)
labels_kmedoids = label

labels_kmedoids_corr,medoids_corr = kMedoids.cluster(corr_distance,k= n_cluster)
labels_kmedoids_spear,medoids_spear = kMedoids.cluster(corr_distance,k= n_cluster)


SSE["kMedoids"] = hlp.getSSE(X_z,X_z[labels_kmedoids])
SSE["kMedoids_corr"] = hlp.getSSE(X_z,X_z[labels_kmedoids_corr])
SSE["kMedoids_spear"] = hlp.getSSE(X_z,X_z[labels_kmedoids_spear])


SILOUHAITE["kMedoids"] = hlp.getSilouhaite(X_z,labels_kmedoids)
SILOUHAITE["kMedoids_corr"] = hlp.getSilouhaite(X_z,labels_kmedoids_corr)
SILOUHAITE["kMedoids_spear"] = hlp.getSilouhaite(X_z,labels_kmedoids_spear)


hlp.Clusters_plot(X= X_pca, labels = label
                  ,info=["PCA K-Medoids","Euclidienne","%d clusters :inertia %.2f"%(len(set(label)),SSE["kMedoids"])])
#hlp.Clusters_plot(X= X_tsne, labels = label,info=["TSNE K-Medoids","Correlation","%d clusters"%len(set(label))])

t-SNE representation of the clustering

In [95]:
X_tsne = TSNE(n_components = 3).fit_transform(X_z)

plt.figure(figsize=(14,6))
colors = [str(item/255.) for item in labels_kmedoids]
plt.suptitle("Total inertia %.02f"%kmeans.inertia_)  
plt.subplot(1,2,1)
plt.scatter(X_tsne[:,0],X_tsne[:,1],cmap ="Paired" ,c=colors,s=20)
plt.subplot(1,2,2)
plt.scatter(X_tsne[:,0],X_tsne[:,2],cmap ="Paired" ,c=colors,s=20)
plt.show(block = True)

Clustering methods Metrics

In [96]:
for k,v in SSE.items():
    print(" \"%s\" : %.2f"%(k,v))

print()
    
for k,v in SILOUHAITE.items():
    print(" \"%s\" : %.2f"%(k,v))
 "kMedoids" : 177352.23
 "Agg_complete" : 521110.11
 "kMedoids_corr" : 176833.41
 "kMedoids_spear" : 177202.34

 "kMedoids" : 0.03
 "kMedoids_corr" : 0.02
 "kMedoids_spear" : 0.02

Save Clustering Results

In [97]:
processed_path = "..\\data\\processed\\"
file_name = "p2c4_clustering_clean.csv"

version = 6

file_name = "p2c4_clustering_clean_week_v%d.csv"%version


def labels_to_df(labels):
    medoid_cluster_dict = dict()
    
    medoids = list(set(labels))
    for i,l in enumerate(medoids):
        medoid_cluster_dict[l] = i+1

    pd_tuples_list = list(product_df_full[row_headers].itertuples(index=False))
    headers_list = [tuple(x) for x in pd_tuples_list]
    
    rows=[]
    for i,h in enumerate(headers_list):
        m = labels[i]
        rows.append([h[0],h[1],medoid_cluster_dict[m],"%s"%(headers_list[m],)])


    label_df = pd.DataFrame(rows,columns = row_headers + ["Cluster","Centroid"])
    return label_df




eucl_df = labels_to_df(labels_kmedoids)
corr_df = labels_to_df(labels_kmedoids_corr)
spear_df = labels_to_df(labels_kmedoids_spear)


eucl_df.to_csv(models_path+"euc_"+file_name, sep=';', encoding='utf-8')
corr_df.to_csv(models_path+"corr_"+file_name, sep=';', encoding='utf-8')
spear_df.to_csv(models_path+"spear_"+file_name, sep=';', encoding='utf-8')


print(eucl_df.shape)
eucl_df.head()
(2846, 4)
Out[97]:
Product Client Cluster Centroid
0 GBA001BND060SS 68A139 79 ('GWQ001BND100FS', '68S004')
1 GBA001BND060SS 68C120 85 ('GWS074BND125FS', '68E103')
2 GBA001BND060SS 68C122 28 ('GWC012BND180FS', '68X077')
3 GBA001BND060SS 68C126 7 ('GBA001BND060SS', '68X080')
4 GBA001BND060SS 68C129 22 ('GWM007BND090SS', '68X077')

Display Clustering Results

In [98]:
carr = hlp.Cluster_series_plot(data_df = product_df_full, cluster_df = eucl_df,headers = row_headers)
In [ ]:
carr = hlp.Cluster_series_plot(data_df = product_df_full, cluster_df = corr_df,headers = row_headers)
In [ ]:
carr = hlp.Cluster_series_plot(data_df = product_df_full, cluster_df = spear_df,headers = row_headers)
In [100]:
carr = hlp.Cluster_series_plot(data_df = product_df_full, cluster_df = eucl_df,headers = row_headers,centroid_only = True)
#
#

Other Methods

BIRCH Algorithm

In [ ]:
label = Birch(n_clusters= n_cluster, threshold=0.5, compute_labels=True).fit_predict(X_z)
labels_birch = label

SSE["Birch"] = hlp.getSSE(X_z,X_z[labels_birch])

hlp.Clusters_plot(X= X_pca, labels = label,info=["PCA BIRCH","(50, 0.5)","%d clusters"%len(set(label))])

SOM

In [ ]:
from minisom import MiniSom   


som = MiniSom(14, 14, 104, sigma=0.5, learning_rate=0.01) # initialization of 6x6 SOM

som.train_batch(X_z, 1000) # trains the SOM with 100 iterations

# Plotting the response for each pattern in the iris dataset
plt.bone()
plt.pcolor(som.distance_map().T)  # plotting the distance map as background
plt.colorbar()

qnt = som.quantization(X_z)
x,y = som.winner(X_z[0])


plt.show()

SOM with SOMpy

In [1]:
import sompy
mapsize = [14,14]
som = sompy.SOMFactory().build(Data1, mapsize, mask=None, mapshape='planar', lattice='rect', normalization='var', initialization='pca', neighborhood='gaussian', training='batch', name='sompy')  
# this will use the default parameters, but i can change the initialization and neighborhood methods
som.train(n_job=1, verbose='info')  # verbose='debug' will print more, and verbose=None wont print anything
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-1-3759cf310b07> in <module>()
      1 import sompy
      2 mapsize = [14,14]
----> 3 som = sompy.SOMFactory().build(Data1, mapsize, mask=None, mapshape='planar', lattice='rect', normalization='var', initialization='pca', neighborhood='gaussian', training='batch', name='sompy')
      4 # this will use the default parameters, but i can change the initialization and neighborhood methods
      5 som.train(n_job=1, verbose='info')  # verbose='debug' will print more, and verbose=None wont print anything

AttributeError: 'module' object has no attribute 'SOMFactory'

Fuzzy c-means

In [ ]:
import skfuzzy as fuzz

ncenters = n_cluster
cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(
        X_z.T, ncenters, 2, error=0.005, maxiter=1000, init=None)



labels_fuzzy = np.argmax(u, axis=0)

Representation methods

PAA_SAX representation

In [ ]:
from pyts.transformation import PAA,SAX
from pyts.visualization import plot_paa, plot_sax
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
ts = X_z[0].reshape(1,-1)

paa_win = 100
sax_bin = 30


#PAA example
paa = PAA(window_size=None, output_size=paa_win, overlapping=True)
X_paa = paa.transform(ts)
plot_paa(ts[0], window_size=None, output_size=paa_win, overlapping=True, marker='o')

#SAX example
sax = SAX(n_bins=sax_bin, quantiles='gaussian')
X_sax = sax.transform(X_paa)
plot_sax(X_paa[0], n_bins=sax_bin, quantiles='gaussian')
plt.show()

#Extract SAX vector
X_SAX= []
for x in X_z:
    ts = x.reshape(1,-1)
    paa = PAA(window_size=None, output_size=paa_win, overlapping=True)
    X_paa = paa.transform(ts)
    sax = SAX(n_bins=sax_bin, quantiles='gaussian')
    X_sax = sax.transform(X_paa)[0]
    X_SAX.append(list(X_sax))

#Cast back to int
X_SAX = np.array(X_SAX).astype('|S1')
X_SAX = X_SAX.view(np.uint8) - 98
# X_z = X_SAX
# N,M  = X_z.shape
In [125]:
tuple([1,2,3])
Out[125]:
(1, 2, 3)